Github repository: https://github.com/GreatLearningAIML1/gl-pgp-aiml-uta-intl-aug20-adris-misra
Project - Term Deposit Sale¶Goal:¶Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe term deposit and thus increase their hit ratio.
Domain:¶Resources Available¶The historical data for this project is available in file https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Objective:¶The classification goal is to predict the likelihood of an existing customer subscribing to a term deposit.
Learning Outcomes:¶Attribute Information:¶Univariate analysis (12 marks)
a. Univariate analysis – data types and description of the independent attributes which should include (name, meaning, range of values observed, central values (mean and median), standard deviation and quartiles, analysis of the body of distributions / tails, missing values, outliers.
b. Strategies to address the different data challenges such as data pollution, outlier treatment and missing-value treatment.
c. Please provide comments in jupyter notebook regarding the steps you take and insights drawn from the plots.
Multivariate analysis (8 marks)
a. Bi-variate analysis between the predictor variables and target column. Comment on your findings in terms of their relationship and degree of relation if any. Visualize the analysis using boxplots and pair plots, histograms or density curves. Select the most appropriate attributes.
b. Please provide comments in jupyter notebook regarding the steps you take and insights drawn from the plots
# Import warnings to suppress runtime warnings
import warnings
warnings.filterwarnings('ignore')
# Import basic libraries for data and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline
# Import models and relevent class / functions
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
# Import Graph Visualization models
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
# Model interpreter
import lime
import lime.lime_tabular
from os import system
# Import data
# NOTE(review): pd.read_csv uses the default comma separator; the original UCI
# bank-full.csv is semicolon-delimited — presumably this copy was re-saved with
# commas (later cells read a 'Target' column, which the UCI file names 'y'). Verify.
cust = pd.read_csv("bank-full.csv")
# checking sample head data (first 5 rows)
cust.head()
# checking sample tail data (last 5 rows)
cust.tail()
# shape of dataframe: (rows, columns)
cust.shape
# column dtypes and non-null counts
cust.info()
# Check for Missing Values (NaN count per column)
cust.isnull().sum()
# transposed summary statistics of the numeric columns
cust.describe().T
Preliminary data analysis:
# Number of unique values in each column
cust.nunique()
# Separate numerical columns from categorical columns.
# select_dtypes is the public API; the previous _get_numeric_data() call is a
# private pandas method that can change or disappear between versions.
num_col = list(cust.select_dtypes(include=np.number).columns)
cat_col = list(set(cust.columns) - set(num_col))
print("Numerical columns: ", num_col)
print("Categorical columns:", cat_col)
# Value counts (absolute and percentage) for every categorical column
for col in cat_col:
    print('*********', col, '***********')
    df = pd.concat([cust[col].value_counts(),
                    cust[col].value_counts(normalize=True) * 100], axis=1)
    df = df.reset_index()
    df.columns = [col, 'count', 'frequency%']
    print(df)
    print()
# One count plot per categorical column, split by the Target class.
# Figure width scales with the number of distinct levels so labels stay legible.
for column in cat_col:
    n_levels = cust[column].nunique()
    plt.figure(figsize=(n_levels * 1.5, 5))
    sns.countplot(x=cust[column], hue=cust['Target'])
    plt.xlabel(column, size=20)
    plt.ylabel('Count', size=20)
    plt.xticks(rotation=90, size=15)
    plt.yticks(size=15)
    plt.show()
# Skew of all numeric variables. numeric_only=True keeps the object columns
# out of the computation — pandas >= 2.0 raises on them otherwise.
cust.skew(numeric_only=True).sort_values(ascending=False)
# Kurtosis of all numeric variables
cust.kurt(numeric_only=True).sort_values(ascending=False)
# Distribution of each continuous column, split by Target class.
# sns.distplot was deprecated and removed from seaborn; histplot with
# stat='density' and kde=True produces the equivalent density plot.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
for col in num_col:
    fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
    sns.histplot(cust[cust['Target'] == "no"][col], kde=True, stat='density', ax=ax1)
    ax1.set_title('Target = "no"', fontsize=18)
    ax1.set_xlabel(col, size=20)
    ax1.set_ylabel('Density', size=20)
    sns.histplot(cust[cust['Target'] == "yes"][col], kde=True, stat='density', ax=ax2)
    ax2.set_title('Target = "yes"', fontsize=18)
    ax2.set_xlabel(col, size=20)
    ax2.set_ylabel('Density', size=20)
    fig.tight_layout()
    plt.show()
plt.rcParams.update(plt.rcParamsDefault)
# Boxplot of every numeric column against Target to inspect outliers per class
for feature in num_col:
    sns.boxplot(y=cust[feature], x=cust['Target'])
    plt.xlabel('Target', size=15)
    plt.ylabel(feature, size=15)
    plt.show()
Observation:
# Pair plot of all attributes coloured by Target, KDE curves on the diagonal
sns.set_context(rc={"axes.labelsize": 30})
grid = sns.pairplot(cust, hue='Target', diag_kind='kde', height=4)
# Restore default rc settings so later figures are unaffected
plt.rcParams.update(plt.rcParamsDefault)
Observation:
# day vs month pivot: count of subscribers (Target == 'yes')
pd.pivot_table(data=cust[cust['Target'] == 'yes'], index='day',
               columns='month', values='Target', aggfunc='count')
# day vs month pivot: count of non-subscribers (Target == 'no')
pd.pivot_table(data=cust[cust['Target'] == 'no'], index='day',
               columns='month', values='Target', aggfunc='count')
Observation:
# Work on a copy so the raw frame stays intact; map the binary yes/no
# columns (including the target) to 1/0.
cust_encoded = cust.copy()
yes_no = {'yes': 1, 'no': 0}
for binary_col in ['default', 'housing', 'loan', 'Target']:
    cust_encoded[binary_col] = cust_encoded[binary_col].map(yes_no)
cust_encoded
# Pair plot on the encoded frame (no hue; KDE diagonals)
sns.set_context(rc={"axes.labelsize": 30})
sns.pairplot(cust_encoded, diag_kind='kde', height=4)
plt.rcParams.update(plt.rcParamsDefault)
# Correlation heatmap. numeric_only=True excludes the remaining object
# columns, on which DataFrame.corr raises in pandas >= 2.0.
plt.figure(figsize=(20, 18))
sns.heatmap(cust_encoded.corr(numeric_only=True), annot=True, annot_kws={"size": 18})
plt.xticks(rotation=90, size=12, color='blue')
plt.yticks(rotation=0, size=12, color='blue')
Observation:
cust_encoded.info()
# Cast the remaining string ('object') columns to pandas Categorical dtype
object_cols = [c for c in cust_encoded.columns if cust_encoded[c].dtype == 'object']
for col in object_cols:
    cust_encoded[col] = pd.Categorical(cust_encoded[col])
# checking sample head data
cust_encoded.head(10)
cust_encoded.info()
# Independent attributes (X) and dependent target (y)
X = cust_encoded.drop(['Target'], axis=1)
y = cust_encoded['Target']
# One-hot encode the categorical attributes
X = pd.get_dummies(X)
X.columns
X.T
# Split the data into training and test sets (70:30), fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)
# Confirm the class distribution is similar across the full, train and test sets
splits = {'y': y, 'y_train': y_train, 'y_test': y_test}
first = True
for name, series in splits.items():
    prefix = '' if first else '\n\n'
    first = False
    print(prefix + name + ':\n', series.value_counts(normalize=True) * 100)
# Logistic Regression fit on the training data.
# max_iter raised from the default 100: lbfgs rarely converges within 100
# iterations on unscaled data like this, and the resulting ConvergenceWarning
# is hidden by the global warnings.filterwarnings('ignore') above.
logreg = LogisticRegression(max_iter=1000, random_state=7)
logreg.fit(X_train, y_train)
# Predict the target variable on the held-out test data
y_predict = logreg.predict(X_test)
# Confusion matrix (rows = observed, columns = predicted)
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Training accuracy
logreg.score(X_train, y_train)
# Test accuracy
logreg.score(X_test, y_test)
# Performance metrics on the test set
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# ROC curve: the AUC in the legend is computed from hard predictions,
# the curve itself from the positive-class probabilities.
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:, 1], pos_label=1)
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.01, 1.0])
plt.ylim([-0.01, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
**Observation:** Logistic regression
Since the bank wants to convert more customers to term deposits, true positives should be high and false negatives low, so recall is the important metric here. For logistic regression the recall is 0.26, so the model does not meet the business need.
# Comparison table: one row per model, one column per performance metric
metric_cols = ['model', 'precision', 'recall', 'accuracy', 'F1 score', 'roc auc score']
dfModelComp = pd.DataFrame(columns=metric_cols)
dfModelComp
# Record logistic-regression metrics in the comparison table.
# DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
model_perf = {'model': 'Logistic regression',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
# Unrestricted decision tree (entropy criterion) as a baseline
dTree = DecisionTreeClassifier(criterion='entropy', random_state=7)
dTree.fit(X_train, y_train)
# Train score first, then test score
for score in (dTree.score(X_train, y_train), dTree.score(X_test, y_test)):
    print(score)
**Observation:** Decision Tree
Compared to the training score, the test score is much lower because the decision tree is built by a greedy algorithm. The tree is highly overfit, so we need to restrict its growth by pruning.
# Pruned tree: by trial, max_depth=7 gives a balanced train/test score.
dTreeP = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state=7)
dTreeP.fit(X_train, y_train)
# Scores from the pruned tree (train, then test)
print(dTreeP.score(X_train, y_train))
print(dTreeP.score(X_test, y_test))
# Predictions of the pruned tree on the test set
y_predict = dTreeP.predict(X_test)
# Confusion matrix of the pruned tree
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics of the pruned tree
metric_values = (precision_score(y_test, y_predict),
                 recall_score(y_test, y_predict),
                 accuracy_score(y_test, y_predict),
                 f1_score(y_test, y_predict),
                 roc_auc_score(y_test, y_predict))
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(*metric_values))
# ROC curve: legend AUC from hard predictions, curve from probabilities
dTreeP_roc_auc = roc_auc_score(y_test, dTreeP.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, dTreeP.predict_proba(X_test)[:, 1])
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % dTreeP_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.01, 1.0])
plt.ylim([-0.01, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
# Hyper-parameter tuning of the decision tree via grid search
grid_values = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(5, 9),
    'min_samples_leaf': np.arange(5, 25),
}
dTree = DecisionTreeClassifier(random_state=7)
dTreeGV = GridSearchCV(dTree, param_grid=grid_values)
dTreeGV.fit(X_train, y_train)
# Predict with the best estimator found by the grid search
y_predictGV = dTreeGV.predict(X_test)
# Grid-search evaluation: test score and winning parameters
print('Evaluation Score : ' + str(dTreeGV.score(X_test, y_test)))
print('Best Parameters : ' + str(dTreeGV.best_params_))
# Confusion matrix of the grid-search model
cm = confusion_matrix(y_test, y_predictGV)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics of the grid-search model
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predictGV), recall_score(y_test, y_predictGV),
    accuracy_score(y_test, y_predictGV), f1_score(y_test, y_predictGV),
    roc_auc_score(y_test, y_predictGV)))
**Observation:**Decision Tree performance
# Record pruned-tree metrics. DataFrame.append was removed in pandas 2.0;
# rows are now added with pd.concat on a one-row frame.
model_perf = {'model': 'Decision Tree pruned',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
# Record grid-search-tree metrics
model_perf = {'model': 'Decision Tree Best Param',
              'precision': precision_score(y_test, y_predictGV),
              'recall': recall_score(y_test, y_predictGV),
              'accuracy': accuracy_score(y_test, y_predictGV),
              'F1 score': f1_score(y_test, y_predictGV),
              'roc auc score': roc_auc_score(y_test, y_predictGV)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
## Feature importances of the pruned tree, sorted for interpretation.
## The previous unnormalized tree_.compute_feature_importances(normalize=False)
## result was never used and has been removed; feature_importances_ already
## provides the (normalized) values reported below.
feat_imp_dict = dict(zip(X_train.columns, dTreeP.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
# Bagging ensemble (100 estimators)
bgcl = BaggingClassifier(n_estimators=100, random_state=7)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
print(bgcl.score(X_test, y_test))
# Confusion matrix for bagging
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics for bagging
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# Record bagging metrics (pd.concat: DataFrame.append was removed in pandas 2.0)
model_perf = {'model': 'Bagging',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
# Adaptive boosting (250 estimators)
abcl = AdaBoostClassifier(n_estimators=250, random_state=7)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
print(abcl.score(X_test, y_test))
# Confusion matrix for adaptive boosting
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics for adaptive boosting
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# Record AdaBoost metrics (pd.concat: DataFrame.append was removed in pandas 2.0)
model_perf = {'model': 'Adaptive boosting',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
# Gradient boosting (250 estimators)
gbcl = GradientBoostingClassifier(n_estimators=250, random_state=7)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
print(gbcl.score(X_test, y_test))
# Confusion matrix for gradient boosting
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics for gradient boosting
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# Record gradient-boosting metrics (pd.concat: DataFrame.append removed in pandas 2.0)
model_perf = {'model': 'Gradient boosting',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
# Random forest (100 trees, at most 40 candidate features per split)
rfcl = RandomForestClassifier(n_estimators=100, random_state=7, max_features=40)
rfcl = rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
print(rfcl.score(X_test, y_test))
# Confusion matrix for the random forest
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics for the random forest
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# Record random-forest metrics (pd.concat: DataFrame.append removed in pandas 2.0)
model_perf = {'model': 'Random Forest',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
**Observation:**
# Base learners for the stacking ensemble (the two strongest models above)
base_learners = [
    ('model_1', BaggingClassifier(n_estimators=100, random_state=7)),
    ('model_2', GradientBoostingClassifier(n_estimators=250, random_state=7)),
]
# Stacking classifier with a logistic-regression meta-learner
stackcl = StackingClassifier(estimators=base_learners,
                             final_estimator=LogisticRegression(random_state=7))
stackcl.fit(X_train, y_train)
y_predict = stackcl.predict(X_test)
print(stackcl.score(X_test, y_test))
# Confusion matrix for stacking
cm = confusion_matrix(y_test, y_predict)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={"size": 15})
plt.ylabel('Observed', size=15)
plt.xlabel('Predicted', size=15)
plt.xticks(size=25, color='blue')
plt.yticks(size=25, color='blue')
# Performance metrics for stacking
print("Precision: {}\nRecall: {}\nAccuracy: {}\nF1 Score: {}\nROC AUC Score: {}".format(
    precision_score(y_test, y_predict), recall_score(y_test, y_predict),
    accuracy_score(y_test, y_predict), f1_score(y_test, y_predict),
    roc_auc_score(y_test, y_predict)))
# Record stacking metrics (pd.concat: DataFrame.append was removed in pandas 2.0)
model_perf = {'model': 'Stacking',
              'precision': precision_score(y_test, y_predict),
              'recall': recall_score(y_test, y_predict),
              'accuracy': accuracy_score(y_test, y_predict),
              'F1 score': f1_score(y_test, y_predict),
              'roc auc score': roc_auc_score(y_test, y_predict)}
dfModelComp = pd.concat([dfModelComp, pd.DataFrame([model_perf])], ignore_index=True)
dfModelComp
**Observation:** Using a stacking mechanism with the best models also gives similar performance. Considering the extra resource and time consumption of stacking, it is not worthwhile.